session info:
sessionInfo()## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.29 R6_2.5.1 jsonlite_1.8.0 magrittr_2.0.3
## [5] evaluate_0.15 stringi_1.7.8 rlang_1.0.6 cli_3.4.1
## [9] data.table_1.14.2 rstudioapi_0.13 jquerylib_0.1.4 bslib_0.3.1
## [13] rmarkdown_2.14 tools_4.2.0 stringr_1.4.1 xfun_0.31
## [17] yaml_2.3.5 fastmap_1.1.0 compiler_4.2.0 htmltools_0.5.2
## [21] knitr_1.39 sass_0.4.1
Install and load packages
# install CRAN packages (if not yet installed)
sapply(c("data.table", "tidyverse", "devtools", "readxl", "kableExtra", "ngram", "networkD3", "igraph", "network", "patchwork", "koRpus", "pbapply", "tidytext", "cluster", "ggrepel", "animation", "vroom", "ggrepel", "Rtsne", "DT"), function(x) if(!is.element(x, installed.packages())) install.packages(x, dependencies = T, repos = "http://cran.us.r-project.org"))## $data.table
## NULL
##
## $tidyverse
## NULL
##
## $devtools
## NULL
##
## $readxl
## NULL
##
## $kableExtra
## NULL
##
## $ngram
## NULL
##
## $networkD3
## NULL
##
## $igraph
## NULL
##
## $network
## NULL
##
## $patchwork
## NULL
##
## $koRpus
## NULL
##
## $pbapply
## NULL
##
## $tidytext
## NULL
##
## $cluster
## NULL
##
## $ggrepel
## NULL
##
## $animation
## NULL
##
## $vroom
## NULL
##
## $ggrepel
## NULL
##
## $Rtsne
## NULL
##
## $DT
## NULL
# install non-CRAN packages (if not yet installed)
if(!is.element("concordances", installed.packages())) {
devtools::install_github("hartmast/concordances")
}
# if this doesn't work, check sfla.ch for the package
if(!is.element("collostructions", installed.packages())) {
install.packages("https://sfla.ch/wp-content/uploads/2021/02/collostructions_0.2.0.tar.gz", repos = NULL)
}
# install "concordances" (pinned to a specific commit) if not yet installed.
# The commit hash was originally broken across two lines (a rendering
# artifact), which made the string literal invalid; restored to one line.
# NOTE(review): an unpinned install of the same package is already attempted
# earlier in this script; if that succeeded, this pinned install never runs.
if(!is.element("concordances", installed.packages())) {
  devtools::install_github("hartmast/concordances", ref = "f4ca785")
}
# install "wordVectors" if not yet installed
if(!is.element("wordVectors", installed.packages())) {
devtools::install_github("bmschmidt/wordVectors")
}
# load "concordances"
library(concordances)
library(tidyverse)
library(readxl)
library(data.table)
library(kableExtra)
library(collostructions)
library(wordVectors)
library(vroom)
library(ggrepel)
library(cluster)
library(patchwork)
library(DT)The data were retrieved from the NoSketchEngine instance of the COW corpora (https://www.webcorpora.org/) on Sept 27, 2019, using the ENCOW16B “World Englishes” corpus. The query was:
[word="[Tt]he"] "mother" "of" "all" []The data were exported to the XML file mother_of_all.xml, which is imported in the subsequent step.
d <- getNSE("../data/mother_of_all.xml", xml = TRUE, context_tags = FALSE, verbose = FALSE)The concordance is exported for annotation, the annotated file is then read in.
# write_excel_csv(d, "mother_of_all_ENCOW.csv")
d <- read_xlsx("../data/mother_of_all_ENCOW.xlsx")We only keep the instances manually tagged as keep == “y”, excluding false hits and doubtful cases.
d <- filter(d, keep == "y")# frequency table
d_tbl <- d %>% select(lemma) %>% table %>% as_tibble() %>% rename(c(Freq = "n")) %>% arrange(desc(Freq))
# overview table
tibble(
# types
Types = nrow(d_tbl),
# tokens
Tokens = sum(d_tbl$Freq),
# hapax legomena
"Hapax Legomena" = length(which(d_tbl$Freq==1))
) %>% kbl() %>% kable_material(c("striped", "hover"))| Types | Tokens | Hapax Legomena |
|---|---|---|
| 1669 | 4127 | 1092 |
For performing a collostructional analysis, we have to know how often a lemma attested in the open slot of the mother of all construction occurs in the ENCOW corpus as a whole. For this purpose, we read in the ENCOW word list (available at webcorpora.org).
# not in repository,
# available at webcorpora.org after registration
# frequencies for relevant hits available at ../data/mother_of_all_with_encow_frequencies.csv
encow <- fread("/Volumes/My Passport/ENCOW word lists/encow16ax.lp.tsv",
header = F)
#
# head(encow)
#
# # only nouns
# encow <- encow[V2 %in% c("NN", "NE")]
#
# head(encow)This is a huge database, but we only need nouns and adjectives, so we drop the rest in order to speed up the next calculations. Also, we add more self-explanatory column names:
# Restrict the ENCOW frequency list to common nouns (NN) and proper
# nouns (NE); all other POS tags are dropped to speed up later steps.
# only nouns
encow <- encow[V2 %in% c("NN", "NE")]
# give the unnamed columns (V1/V2/V3) self-explanatory names
colnames(encow) <- c("Lemma", "POS", "Freq")
# if something is attested both as NN and
# NE, sum them up
# (data.table aggregation: the summed column is auto-named V1 and is
# renamed back to Freq in the following setnames() step)
encow <- encow[, sum(Freq), by = Lemma]
setnames(encow, old = "V1", new = "Freq")We compile a frequency table from our mother of all
concordance d and combine it with the corpus frequencies in
the encow table.
# Tabulate how often each lemma fills the open slot of the
# [mother of all X] construction, sorted by descending frequency.
# get frequencies
d_tbl <- d %>% select(lemma) %>% table %>% sort(decreasing = T) %>%
as.data.frame(stringsAsFactors = F)
# column names expected by the collostructional analysis below
colnames(d_tbl) <- c("Lemma", "Freq_in_cxn")
# join tables
d_tbl <- left_join(d_tbl, encow, by = "Lemma")These data are not without noise - ENCOW’s lemmatization is of course not perfect, but we can still expect that the relationships between construction frequencies and corpus frequencies of the words in question are roughly representative of their “actual” relationship in everyday use, at least in the text types represented in the corpus. However, given that the lemmatization is not perfect, there can be cases where the corpus frequency is lower than the construction frequency. This is of course not possible - if a word occurs, say, three times in a specific construction in corpus, its total frequency in the corpus cannot be lower than three! In one instance, however, this is the case in our data. The reason for this is that our concordance was lemmatized manually while for ENCOW, we rely on the automatic lemmatization. As this only affects one single attestation, we just exclude it.
# omit one case where corpus frequency is
# smaller than cxn frequency
d_tbl <- subset(d_tbl, d_tbl[,2] <= d_tbl[,3])Next, we perform a collexeme analysis using Flach’s
collostructions package.
# sum(encow$Freq): 1805183579
# perform collexeme analysis
left_join(collex(as.data.frame(d_tbl), corpsize = 1805183579, delta.p = TRUE),
select(collex(as.data.frame(d_tbl), corpsize = 1805183579, am = "odds"), COLLEX, COLL.STR.ODDS)) %>% DT::datatable() #kbl() %>% ## Joining, by = "COLLEX"
#kable_material(c("striped", "hover")) %>% scroll_box(width = "800px", height = "200px")# read data
moa <- read_xlsx("../data/motherofall_COCA.xlsx")
# full frequency list cannot be shared publicly
# for license reasons, hence we work with the
# list containing only the lemmas occurring in
# mother of all
#coca <- fread("../coca_2017_lemma_frequency_list.txt", quote = "")
coca <- fread("../data/coca_moa_lemma_frequencies.csv")
# replace whitespaces in column names
colnames(moa) <- gsub(" ", "_", colnames(moa))
colnames(coca) <- c("No", "Lemma", "Freq")
# omit false hits
moa <- subset(moa, keep=="y")
# types, tokens, and hapax legomena overall
moa_tbl1 <- moa %>% select(lemma) %>% table %>% as_tibble() %>% rename(c(Freq = "n") ) %>% arrange(desc(Freq))
tibble(
Tokens = sum(moa_tbl1$Freq),
Types = nrow(moa_tbl1),
"Hapax Legomena" = length(which(moa_tbl1$Freq==1))
)# generate input for collostructional analysis
# Frequency of each lemma in the construction (COCA concordance),
# sorted by descending frequency.
moa_lemmas <- moa$lemma %>% table %>% sort(decreasing = T) %>% as.data.frame(stringsAsFactors = F)
colnames(moa_lemmas) <- c("Lemma", "Freq_in_cxn")
# NOTE(review): this is data.table's positional DT[i, j, by] form, i.e.
# j = Lemma grouped by Freq; presumably coca[, .(Lemma, Freq)] was
# intended. The result happens to contain both columns (which is why
# the setcolorder below works), but this should be confirmed.
all_lemmas <- coca[,Lemma, Freq]
setcolorder(all_lemmas, c("Lemma", "Freq"))
# join construction frequencies with overall corpus frequencies
# (join.freqs is from the collostructions package)
collex_input <- join.freqs(moa_lemmas, as.data.frame(all_lemmas), all = F)
colnames(collex_input) <- c("Lemma", "cxn_freq", "cxn_all")
# drop lemmas without a corpus frequency (lemmatization mismatches)
collex_input <- subset(collex_input, cxn_all != 0)
left_join(collex(collex_input, corpsize = sum(coca$Freq), delta.p = T),
select(collex(collex_input, corpsize = sum(coca$Freq), am = "odds"), COLLEX, COLL.STR.ODDS)) %>% DT::datatable()## Joining, by = "COLLEX"
# %>% write_excel_csv("simple_collexeme_analysis.csv")
# relative frequency ------------------------------------------------------
# get COCA frequencies
coca_freq <- read_xlsx("../data/COCA2017_total_frequencies.xlsx")
# tabulate mother frequency
moa_tbl <- table(moa$Year) %>% as.data.frame(stringsAsFactors = F)
colnames(moa_tbl) <- c("YEAR", "Freq")
moa_tbl$YEAR <- as.numeric(moa_tbl$YEAR)
moa_tbl <- left_join(coca_freq, moa_tbl, by = "YEAR")
moa_tbl$pmw <- (moa_tbl$Freq / moa_tbl$TOTAL) * 1e06
# plot
# png("mother_of_all_coca_freq.png", width = 6.5, height = 5, un = "in", res = 300)
plot(moa_tbl$YEAR, moa_tbl$pmw, pch = 20, col = "blue",
ylab = "Frequency per million words", xlab = "Year",
main = "Token and type frequencies")
#main = expression(paste("[", italic("mother of all"), " X], COCA")))
abline(lm(moa_tbl$pmw ~ moa_tbl$YEAR), lty = 2, col = "darkgrey")# dev.off()
# Type counts (distinct lemmas) and token counts per decade / per year.
# types per decade
moa_types <- moa %>% group_by(Decade) %>% summarise(
types = length(unique(lemma)),
n = n()
)
# types per year
moa_types_year <- moa %>% group_by(Year) %>% summarise(
types = length(unique(lemma))
)
# add to table with total frequencies
moa_tbl <- left_join(moa_types_year, moa_tbl, by = c("Year" = "YEAR"))
moa_tbl <- rename(moa_tbl, "YEAR" = "Year")
# type frequency normalized per million words of corpus text
moa_tbl$types_pmw <- (moa_tbl$types / moa_tbl$TOTAL) * 1e6
# coca_freq per decade
coca_freq$Decade <- floor(coca_freq$YEAR/10)*10
coca_freq_decade <- coca_freq %>% group_by(Decade) %>% summarise(
n = sum(TOTAL)
)
# types per decade
# NOTE(review): with no explicit `by`, left_join matches on ALL shared
# columns — here c("Decade", "n"), as the rendered output below shows.
# But `n` means construction tokens in moa_types and corpus size in
# coca_freq_decade, so rows almost certainly never match and no corpus
# sizes are attached; `rel` below then equals types / construction
# tokens, not types / corpus size. Consider by = "Decade" — TODO confirm
# intended behavior before changing, since downstream plots use ttr.
moa_types <- left_join(moa_types, coca_freq_decade)## Joining, by = c("Decade", "n")
moa_types$rel <- moa_types$types / moa_types$n
# Two statements were fused into one invalid line by the rendering
# ("plotmoa_types$ttr"); reconstructed as the two original statements:
# a quick diagnostic plot of the relative frequencies, then the
# type-token ratio per decade used by the plots below.
moa_types$rel %>% plot
moa_types$ttr <- moa_types$types / moa_types$n
# distribution of hapaxes
# Identify hapax legomena (lemmas occurring exactly once in the
# construction) and tabulate their share per decade.
hapaxes <- table(moa$lemma) %>% as.data.frame %>% filter(Freq==1) %>% select(Var1) %>% as.vector
hapaxes <- as.character(hapaxes$Var1)
# flag each attestation as hapax ("y") or not ("n")
moa$hapax <- ifelse(moa$lemma %in% hapaxes, "y", "n")
moa_hapaxes <- moa %>% group_by(Decade) %>% summarise(
hapaxes = length(which(hapax=="y")),
n = n()
)
# plot potential productivity
# potential productivity = hapaxes / tokens (Baayen-style measure)
moa_hapaxes$pp <- moa_hapaxes$hapaxes / moa_hapaxes$n
par(mfrow = c(1,3))
#png("types_tokens_mother_COCA.png", width = 12, height = 4, un = "in", res = 300)
par(mfrow=c(1,3))
par(mar = c(5.1, 5.1, 5.1, 2.1))
plot(moa_tbl$YEAR, moa_tbl$pmw, pch = 20, col = "blue",
ylab = "Frequency per million words", xlab = "Year",
main = "Token and type frequencies",
#main = expression(paste(bold("["), bolditalic("mother of all"), bold(" X], COCA"))),
cex = 2, cex.lab = 2, cex.axis=1.5)
abline(lm(moa_tbl$pmw ~ moa_tbl$YEAR), lty = 2, col = "darkgrey", lwd = 2)
points(moa_tbl$YEAR, moa_tbl$types_pmw, col = rgb(1,0,0,.5), pch = 18, cex = 1.3)
plot(moa_types$Decade, moa_types$ttr,
type = "b", pch=18,
ylab = "Types / Tokens", xlab = "Decade",
main = "Type-Token Ratio, COCA",
lwd = 2, col = "blue", cex = 2, cex.lab = 2, cex.axis=2,
xaxt = "n"
)
axis(1, at = c(1990, 2000, 2010), cex.axis=2)
plot(moa_hapaxes$Decade, moa_hapaxes$pp, type = "b", pch=18,
ylab = "Proportion hapaxes", xlab = "Decade",
main = "Potential productivity \n (proportion of hapax legomena), COCA",
lwd = 2, col = "blue", cex = 2, cex.lab = 2, cex.axis=2, xaxt = "n")
axis(1, at = c(1990, 2000, 2010), cex.axis=2)# dev.off()
par(mar = c(5.1, 4.1, 4.1, 2.1))
par(mfrow=c(1,1))Following Gries (2019), we separate association from frequency by using the log odds ratio as association measure and plotting frequency independently:
(p1 <- collex(collex_input, corpsize = sum(coca$Freq), am = "odds") %>% ggplot(aes(x = log1p(OBS), y = log1p(COLL.STR.ODDS), label = COLLEX, col = log1p(OBS))) + geom_text() + theme_bw() + xlab("Log odds ratio") + ylab("Log Frequency") + scale_color_continuous(low = rgb(0,.7,1,.4), high = "black") + guides(col = 'none') + ggtitle("COCA") +theme(plot.title = element_text(face = "bold", hjust = 0.5)))(p2 <- collex(as.data.frame(d_tbl), corpsize = 1805183579, am = "odds") %>% ggplot(aes(x = log1p(OBS), y = log1p(COLL.STR.ODDS), label = COLLEX, col = log1p(OBS))) + geom_text() + theme_bw() + xlab("Log odds ratio") + ylab("Log Frequency") + scale_color_continuous(low = rgb(0,.7,1,.4), high = "black") + guides(col = 'none') + ggtitle("ENCOW") + theme(plot.title = element_text(face = "bold", hjust = 0.5)))## Warning in log1p(COLL.STR.ODDS): NaNs produced
## Warning in log1p(COLL.STR.ODDS): NaNs produced
## Warning: Removed 2 rows containing missing values (geom_text).
p1 | p2## Warning in log1p(COLL.STR.ODDS): NaNs produced
## Warning in log1p(COLL.STR.ODDS): NaNs produced
## Warning: Removed 2 rows containing missing values (geom_text).
# ggsave("collex_moa_coca_encow.png", height = 7, width = 13)To assess the semantics of the slot fillers in more detail, we use distributional semantics. More specifically, we draw on word2vec. Word2vec is originally the name of a software comprising two main algorithms for representing words in terms of dense vectors, but the term has become virtually synonymous with the approach itself. We use the R package wordVectors, which builds on the original word2vec code. We use the skip-gram approach, which is generally considered to work better with a small amount of training data (see e.g. this tutorial) than the alternative continuous-bag-of-words (cbow) approach. The model was trained based on the first of the 17 downloadable sentence shuffles of ENCOW using the following code:
# read data & export text files without annotation
# Note: the downloadable file containing the first sentence shuffle
# of ENCOW has been split in c. 100 parts, named xaa, xab etc. etc.,
# hence the pattern in the list.file command below. The algorithm
# produces a txt file containing only the words, without annotations
# Convert the raw ENCOW shuffle parts (xaa, xab, ...) into plain-text
# training files for word2vec: strip XML markup and token annotations,
# drop empty lines, and write one cleaned file per input part.
f <- list.files("/Volumes/INTENSO/Corpora/ENCOW/", pattern = "^x..", full.names = TRUE)
# seq_along() instead of 1:length(f): iterates zero times when no files
# match, instead of running with i = 1 and i = 0
for(i in seq_along(f)) {
  d <- vroom_lines(f[i])
  # remove XML markup lines and everything after the first tab
  # (i.e. the annotation columns), keeping only the word itself
  d <- gsub("^<.*|\t.*", "", d)
  d <- d[d!=""]
  vroom_write_lines(d, paste0("/Volumes/INTENSO/Corpora/ENCOW/encow_word2vec_training/encow16b_words_for_training001", i, ".txt"))
  # progress indicator (this loop runs over ~100 large files)
  print(i)
}
# Next, the prep_word2vec command from the WordVectors package is used
# to prepare the training file, containing just the words in lowercase,
# without punctuation. The model is then trained on the basis of this
# file and exported.
prep_word2vec(origin="/Volumes/INTENSO/Corpora/ENCOW/encow_word2vec_training/words", destination="/Volumes/INTENSO/Corpora/ENCOW/encowa_w2v_words.txt", lowercase=T,bundle_ngrams=1)
# train model:
model <- train_word2vec("/Volumes/INTENSO/Corpora/ENCOW/encowa_w2v_words.txt", output_file = "/Volumes/INTENSO/Corpora/ENCOW/encow_vectors_word_based.bin",vectors=100,threads=4,window=5,iter=3,negative_samples=5)
# export:
write_rds(model, "model.Rds")We use this model to visualize the semantic proximity of the slot fillers. For dimensionality reduction, we use both Multidimensional Scaling (MDS) and t-distributed Stochastic Neighbor Embedding (t-SNE). (Only the latter is reported in the paper as it yields more convincing results; we initially worked with MDS and included the results here for the sake of completeness, and to allow for a comparison between the different results.)
# import the model
# (absolute local path — the trained model is not in the repository)
model <- readRDS("/Users/stefanhartmann/sciebo/Projekte/snowclones/word2vec/model.Rds")
# matrix of terms occurring in [mother of all X]'s open slot
# (pairwise cosine distances between the slot fillers' word vectors)
cosine_dist_matrix <- cosineDist(model[[moa_lemmas$Lemma, average = FALSE]], model[[moa_lemmas$Lemma, average = FALSE]])
# multidimensional scaling
# project the distance matrix to 2 dimensions (columns V1, V2)
cosine_dists <- cosine_dist_matrix %>% cmdscale() %>% as.data.frame() %>% rownames_to_column() %>% setNames(c("lemma", "V1", "V2"))
# alternative: t-SNE
cosine_rtsne <- cosine_dist_matrix %>% Rtsne::Rtsne()
# we use Partitioning Around Medoids (PAM) to
# identify a small number of clusters (here: 3).
# As the results are not really meaningful, we have
# refrained from including it in the final analysis though.
# get PAM clusters
# for(i in 2:10) {
# print(pam(cosine_dists, i)$silinfo$avg.width)
# }
#
# NOTE(review): pam() is applied to the full data frame, which still
# includes the character column "lemma" — confirm this is intended
# (clustering on the MDS coordinates alone may have been meant).
pams <- pam(cosine_dists, 3)$clustering
# add frequency information
# construction frequency of each lemma, for sizing labels in the plots
moa_freqs <- moa$lemma %>% table %>% as_tibble() %>% setNames(c("lemma", "n"))
# combine with MDS results
cosine_dists <- left_join(cosine_dists, moa_freqs)## Joining, by = "lemma"
# add log frequency
# log1p avoids log(0) for unattested/zero counts
cosine_dists$LogFreq <- log1p(cosine_dists$n)
# 3 clusters
cosine_dists$clusters <- pams
# add Rtsne
# append the two t-SNE dimensions as columns dim1, dim2
cosine_dists <- cbind(cosine_dists, setNames(as.data.frame(cosine_rtsne$Y), c("dim1", "dim2")))
# visualize
# add one column that only serves to increase the font
# size of the remaining items (only for print version)
cosine_dists <- rbind(cosine_dists,
data.frame(lemma = "",
V1 = 0,
V2 = 0,
n = 0,
LogFreq = 0.3,
clusters = 1,
dim1 = 0, dim2 = 0))
set.seed(1994)
ggplot(cosine_dists, aes(x = V1, y = V2, label = lemma, size = LogFreq, col = factor(clusters))) +
geom_text_repel(max.overlaps = 15) +
guides(col = "none", size = "none") + theme_bw() +
# theme(axis.text = element_text(size = 18)) +
# theme(axis.title = element_text(size = 18)) +
# theme(strip.text = element_text(size = 18)) +
# theme(legend.title = element_text(size = 18, face = "bold")) +
# theme(text = element_text(size = 18)) +
scale_color_viridis_d() + ylab("dim2") + xlab("dim1")## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# ggsave("distsem_moa_word2vec.png", width = 7, height = 6, dpi=500)
# use RTsne instead
set.seed(1994)
ggplot(cosine_dists, aes(x = dim1, y = dim2, label = lemma, size = LogFreq)) +
geom_text_repel(max.overlaps = 15) +
guides(size = "none") + theme_bw() +
# theme(axis.text = element_text(size = 18)) +
# theme(axis.title = element_text(size = 18)) +
# theme(strip.text = element_text(size = 18)) +
# theme(legend.title = element_text(size = 18, face = "bold")) +
# theme(text = element_text(size = 18)) +
scale_color_viridis_d() + ylab("dim2") + xlab("dim1")# ggsave("distsem_moa_word2vec_tsne.png", width = 7, height = 6, dpi=500)Finally, we compute the semantic distance between mother and all X elements, following the suggestion of a reviewer (thank you!):
mother <- cosineDist(model[[c("mother", moa_lemmas$Lemma), average = FALSE]], model[[c("mother", moa_lemmas$Lemma), average = FALSE]])
mother <- as.data.frame(mother)
png("mothervsrest.png", width = 6.5, height = 5, un = "in", res = 300)
mother[,which(colnames(mother)=="mother")] %>% hist(main = expression(paste("Cosine distance between ", italic("mother "), "and all X items")))
dev.off()## quartz_off_screen
## 2